---
title: "Supplementary Material"
subtitle: "Krasselt/Dreesen (in press): Topic models indicate textual aboutness and pragmatics: Valuation practices in Islamophobic discourse. Journal of Cultural Analytics"
author: "Julia Krasselt"
date: "6.12.2023"
output: html_document
---

```{r setup, include=FALSE}
# default chunk option: echo the R source of every chunk in the rendered output
knitr::opts_chunk$set(echo = TRUE)
```

## Load necessary packages

To run the script, the following packages need to be loaded:

```{r packages, message=FALSE}
library(dplyr)
library(ggplot2)
```

Other packages that need to be installed (their functions are called via `::`) are: data.table, tidyr, pbapply, plyr, RColorBrewer, plotrix, knitr and, for the optional combined plot, ggpubr.

## Read necessary data frames from disk

To run the script, the following data frames are needed:

* Document-Topic-Distribution (mallet output)
* Topic-Word-Assignment (mallet output)
* topic list with top 3 words per topic

```{r dataframes}
# document-topic distribution: load the table and discard its V1 column
doc.topics_df <- select(data.table::fread("doc_topics_df.csv"), -V1)

# topic-word assignment for all documents
topic_word_assignment <- data.table::fread("topic_word_assignment.csv")

# top 3 words per topic, with the topic identifier stored as a factor
top3words_per_topic <- data.table::fread("top3words_per_topic.csv") %>%
  mutate(topic = as.factor(topic))
```

## Get top 3 topics per document

The following function is applied to each document and extracts the top 3 topics represented in that document.

```{r top3topics, message=FALSE, warning=FALSE}
# Extract the highest-weighted topics of a single document.
#
# Reads row `i` of the global `doc.topics_df`; every column except `text_id`
# is treated as the weight of one topic.
#
# Arguments:
#   i         row index of the document in `doc.topics_df`
#   n_topics  number of top topics to keep (default 3); topics tied with the
#             n-th largest weight are all kept, so more rows can be returned
#
# Returns a data.frame with one row per selected topic and the columns
# `text_id` and `top_topics` (the topic's column name), ordered by
# descending weight.
find_top_topics <- function(i, n_topics = 3) {
  # extract the row of document i
  doc_row <- doc.topics_df[i, ]

  # reshape to one row per topic, sort, and keep the n_topics largest weights
  top_rows <- doc_row %>%
    tidyr::pivot_longer(!text_id, names_to = "topic", values_to = "beta") %>%
    arrange(desc(beta)) %>%
    slice_max(beta, n = n_topics, with_ties = TRUE)

  data.frame(
    text_id = as.vector(unique(top_rows$text_id)),
    top_topics = as.vector(top_rows$topic)
  )
}

# apply find_top_topics() to every document (row), showing a progress bar;
# seq_len() yields an empty index for a 0-row table, where 1:nrow() would
# yield c(1, 0)
result_top3 <- pbapply::pblapply(
  X = seq_len(nrow(doc.topics_df)),
  FUN = find_top_topics
)

# row-bind the per-document results into a single data frame
top3_topics <- plyr::ldply(.data = result_top3, .progress = "text")

```

## FREQUENCY OF TOPICS IN STUDY CORPUS

The following code creates a barplot visualizing how often each topic is among the top three topics of a document.

```{r}
# count how often a topic is among the top 3 topics in a document
plotdata <- top3_topics %>%
  group_by(top_topics) %>%
  summarize(n = n())

# annotate topic category
# single lookup table: category label -> topics belonging to it; the order of
# the entries is also the order of the factor levels (and thus of the legend)
category_topics <- list(
  "alleged Islam culture and\n how it contradicts with western culture" =
    c("topic_1", "topic_3", "topic_5", "topic_7", "topic_11", "topic_12",
      "topic_26", "topic_30", "topic_21"),
  "activities of Islam-critical\n movements in Germany" =
    c("topic_18", "topic_20"),
  "alleged Islamic antisemitism" = "topic_22",
  "criticism of German politics and parties" = "topic_4",
  "islam in international contexts" = c("topic_14", "topic_29"),
  "local & domestic issues" = c("topic_2", "topic_17"),
  "media criticism" = "topic_23",
  "on the role of the Quran" = c("topic_8", "topic_27"),
  "violence and terror with alleged\n Islamic origin" =
    c("topic_10", "topic_13", "topic_19", "topic_28"),
  "hermeneutic process" = c("topic_24", "topic_25", "topic_16", "topic_6"),
  "quoting" = c("topic_15", "topic_9")
)

# invert the table to topic -> category; topics missing from it become NA
topic_to_category <- rep(names(category_topics), lengths(category_topics))
names(topic_to_category) <- unlist(category_topics, use.names = FALSE)

plotdata$category <- factor(
  unname(topic_to_category[as.character(plotdata$top_topics)]),
  levels = names(category_topics)
)

# annotate macrostructure: the two categories below indicate pragmatics,
# everything else indicates aboutness
pragmatic_categories <- c("hermeneutic process", "quoting")
plotdata$macrostructure <- ifelse(
  plotdata$category %in% pragmatic_categories,
  "pragmatics indicating topics ",
  "aboutness indicating topics "
)

# adjust topic labels ("topic_4" -> "Topic 4"); only the leading "t" is
# capitalised
plotdata$label <- sub(
  pattern = "^t",
  replacement = "T",
  x = gsub(pattern = "_", replacement = " ", x = plotdata$top_topics)
)

# color palette: one colour per category level
# NOTE(review): `colors` and `colScale` are not used by the plot below, which
# takes its fill colours from scale_fill_brewer(); kept in case they are
# referenced elsewhere
colors <- RColorBrewer::brewer.pal(
  n = nlevels(plotdata$category),
  name = "Set3"
)
names(colors) <- levels(plotdata$category)
colScale <- scale_colour_manual(name = "categorie", values = colors)

corpus_name <- "PI-News, texts with .*koran.*"

# create plot: horizontal bars ordered by frequency, one facet per
# macrostructure, bars filled by category
ggplot(
  data = plotdata,
  aes(x = reorder(label, n), y = n, fill = category)
) +
  geom_col(position = position_dodge()) +
  coord_flip() +
  facet_wrap(~ macrostructure, ncol = 2, scales = "free_y") +
  scale_fill_brewer(palette = "Paired") +
  labs(
    subtitle = paste0("corpus: ", corpus_name),
    y = "\nnumber of texts\n",
    x = "",
    title = "Number of texts per topic"
  ) +
  theme_minimal() +
  theme(
    axis.text.y = element_text(size = 5, hjust = 0),
    axis.text.x = element_text(size = 5),
    axis.title.x = element_text(size = 11),
    plot.title = element_text(size = 10),
    plot.subtitle = element_text(size = 8),
    legend.key.size = unit(0.2, "cm"),
    legend.text = element_text(size = 6),
    legend.title = element_text(size = 6)
  )
```

## 3grams SURA CITATIONS

The following code reads in the data frame with 3-grams from sura citations and prints the top 50 3-grams.

```{r}
# read df containing 3 grams
citations_suras_ngrams <- data.table::fread("citations_suras_3grams.txt")
# print the first 50 rows (presumably the file is already sorted by
# frequency -- confirm); head() returns fewer rows when the table is shorter,
# whereas indexing with 1:50 would pad the table with NA rows
knitr::kable(head(citations_suras_ngrams, 50), caption = "Top 50 sura 3grams")
```

## TOPIC PATHWAYS

The following code creates plots for topic pathways. Each plot shows the pathways of two topics, which need to be specified by the user (`topic_a` and `topic_b`).

```{r message=FALSE}
# filter for texts containing a specific topic combination (adjust manually)
topic_a <- "topic_9"
topic_b <- "topic_19"

# ids of texts that have topic_a among their top topics
texts_with_topic_a <- top3_topics %>%
  filter(top_topics == topic_a) %>%
  distinct() %>%
  pull(text_id)

# ... and that also have topic_b among their top topics
texts_with_topic_a_and_b <- top3_topics %>%
  filter(text_id %in% texts_with_topic_a) %>%
  filter(top_topics == topic_b) %>%
  pull(text_id)

# prepare data for plot
plotdata <- topic_word_assignment %>%
  filter(text_id %in% texts_with_topic_a_and_b) %>%
  # remove all NA
  filter(!is.na(.topic)) %>%
  group_by(text_id) %>%
  mutate(
    # token number per document
    token_id = row_number(),
    # relative token position per document
    rel_token_pos = token_id / n(),
    # the last token has position 1; move it just below 1 so that it falls
    # into the last bin (0.9) instead of opening an eleventh bin
    rel_token_pos = if_else(rel_token_pos == 1, 0.9999, rel_token_pos),
    # bin in 10 sections (0, 0.1, ..., 0.9)
    bin = plyr::round_any(rel_token_pos, accuracy = 0.1, f = floor)
  ) %>%
  ungroup() %>%
  mutate(.topic = as.factor(paste(.topic)))

# calculate topic proportion per bin per document
plotdata <- plotdata %>%
  # 1. count terms per topic per document and exclude topics assigned to
  #    ten or fewer words in a document
  group_by(text_id, .topic) %>%
  filter(n() > 10) %>%
  # 2. count terms per topic per bin per document
  group_by(text_id, bin, .topic) %>%
  summarise(topic_n_in_bin = n(), .groups = "drop") %>%
  # 3. count terms per bin per document and calculate topic proportion per bin
  group_by(text_id, bin) %>%
  mutate(
    words_per_bin = sum(topic_n_in_bin),
    topic_proportion_in_bin = topic_n_in_bin / words_per_bin
  ) %>%
  # 4. calculate mean topic proportion and its standard error per bin
  group_by(bin, .topic) %>%
  summarise(
    topic_proportion_in_bin_grand_mean = mean(topic_proportion_in_bin),
    topic_proportion_in_bin_SE = plotrix::std.error(topic_proportion_in_bin),
    .groups = "drop"
  )

plotdata <- plotdata %>%
  # attach the top 3 words of each topic (used as a label in the plot)
  left_join(top3words_per_topic, by = c(".topic" = "topic")) %>%
  mutate(
    # the label is built from the bare topic id, so it must be created
    # before the id gets its "topic_" prefix
    topWords = paste0("Topic ", .topic, ": ", topWords),
    .topic = paste0("topic_", .topic)
  ) %>%
  # filter plotdata for the two topics a and b
  filter(.topic %in% c(topic_a, topic_b))

# create plot caption
caption <- paste0(
  "corpus: PI-News; TM on texts containing '.*Koran.*'\ntexts containing ",
  topic_a,
  " (blue) and ",
  topic_b,
  " (red); n = ",
  length(texts_with_topic_a_and_b)
)

# pin the colours to the topics so that the caption's "(blue)" / "(red)"
# holds for any topic pair; with the default palette the assignment depends
# on the alphabetical order of the labels. The two values are the colours of
# ggplot2's default two-level discrete palette.
topic_colours <- c("#00BFC4", "#F8766D")
names(topic_colours) <- c(topic_a, topic_b)
label_lookup <- distinct(plotdata, .topic, topWords)
label_colours <- topic_colours[label_lookup$.topic]
names(label_colours) <- label_lookup$topWords

ggplot(
  data = plotdata,
  aes(
    x = bin,
    y = topic_proportion_in_bin_grand_mean,
    group = topWords,
    color = topWords
  )
) +
  geom_line() +
  geom_point() +
  scale_color_manual(values = label_colours) +
  scale_x_continuous(breaks = seq(0, 1, 0.1)) +
  labs(
    caption = caption,
    y = "mean topic proportion per bin\n",
    x = "\nbin (position in text)"
  ) +
  theme_minimal() +
  theme(
    panel.grid.minor = element_blank(),
    # the legend is hidden; the legend settings below only take effect when
    # legend.position is changed
    legend.position = "none",
    # NOTE(review): ggplot2 >= 3.4.0 prefers `linewidth` over `size` here
    legend.background = element_rect(fill = "gray98", size = .4),
    legend.text = element_text(size = 13),
    legend.title = element_text(size = 13),
    plot.title = element_text(size = 20),
    plot.subtitle = element_text(size = 15),
    axis.title.x = element_text(size = 10),
    axis.title.y = element_text(size = 10)
  )

# combine multiple plots
#library(ggpubr)
#ggarrange(graph1, graph2, graph3, graph4,
#          labels = c("A", "B", "C", "D"),
#          ncol = 2, nrow = 2)
```

